Background

load biomart annotations

library(biomaRt)
biomaRt::listEnsemblArchives()
listMarts()

mart_aug2020 <- biomaRt::useMart(biomart = "ENSEMBL_MART_ENSEMBL", 
                         dataset = "hsapiens_gene_ensembl",
                         host = "http://aug2020.archive.ensembl.org")

head(listAttributes(mart_aug2020), n = 100)
grep( "entrez", listAttributes(mart_aug2020)$name,value = T)

t2g_aug2020 <- biomaRt::getBM(attributes = c("entrezgene_id","external_gene_name", "ensembl_gene_id", "ensembl_gene_id_version", "gene_biotype", "transcript_biotype","chromosome_name", "band", "transcript_length", "start_position", "end_position", "strand"), mart = mart_aug2020)
t2g_aug2020 <- dplyr::rename(t2g_aug2020,  entrez_gene = entrezgene_id, ext_gene = external_gene_name, ens_gene = ensembl_gene_id, ens_gene_ver = ensembl_gene_id_version)

save(list = c("t2g_aug2020", "mart_aug2020"), file = "/researchers/antonio.ahn/general/R/Download_resources/RData/mart_aug2020.RData")
load("/researchers/antonio.ahn/general/R/Download_resources/RData/mart_aug2020.RData")

# apr2019 is the last version to not have converged the lncRNA biotypes into just one group
# Starting from July2019, lncRNA biotypes are converged to a single "lncRNA" category.... why?? 

load("/researchers/antonio.ahn/general/R/Download_resources/RData/mart_apr2019.RData")

load data

fcounts <- read_tsv(file = "/researchers/krutika.ambani/Goel_lab_members/Keefe_Chan/220714_sgRB1_sgTP53_sgControl_DMSO_DOX_LY/results/bamfiles/featurecounts_stranded_2/featurecounts_stranded_2.txt", skip = 1)

filtering

# 60656 genes (rows) and 48 samples (columns)
dim(fcounts_mat)
# 15111 genes with zero counts in all 12 samples
table(rowSums(fcounts_mat == 0) == 48)

# Here i chose to keep genes that have higher or equal to count than 5 in at least 3 or more samples (there are 3 samples in my smallest comparison group)
index <- rowSums(fcounts_mat > 5) >= 3
#index
#FALSE  TRUE 
#38707 21949  
table(index)

# This reduces the number of genes
# filtered matrix = fmat 
fcounts_Fmat <- fcounts_mat[index,]

# no duplications in the ENSG ID names
rownames(fcounts_Fmat) %>% duplicated %>% table
rownames(fcounts_mat) %>% duplicated %>% table
write_tsv(fcounts_Fmat %>% as_tibble(
rownames = "ensembl_gene_id"), "/researchers/krutika.ambani/Goel_lab_members/Keefe_Chan/220714_sgRB1_sgTP53_sgControl_DMSO_DOX_LY/R_analysis/220714_intiial_QC_AA/1.QC_exploratory/data/220714_sgRB1_sgTP53_sgControl_DMSO_DOX_LY_fcounts_Fmat.txt")
## converting counts to integer mode
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 8 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing

rlog normalisation (for Hclustering and PCA plots)

dds <- DESeqDataSetFromMatrix(fcounts_Fmat, colData = sample_information, design = ~ treatment)
# adding rowData. This is not required... dunno why i did this previously
table(rownames(dds) %in% t2g_aug2020$ens_gene_ver)
rowData(dds) <- data.frame(ens_gene_ver = rownames(dds), SYMBOL = t2g_aug2020$ext_gene[match(rownames(dds), t2g_aug2020$ens_gene_ver)])

rld <- rlog(dds) %>% assay

save(list = c("dds", "rld"), file = "/researchers/krutika.ambani/Goel_lab_members/Keefe_Chan/220714_sgRB1_sgTP53_sgControl_DMSO_DOX_LY/R_analysis/220714_intiial_QC_AA/1.QC_exploratory/data/rld.RData")
load("/researchers/krutika.ambani/Goel_lab_members/Keefe_Chan/220714_sgRB1_sgTP53_sgControl_DMSO_DOX_LY/R_analysis/220714_intiial_QC_AA/1.QC_exploratory/data/rld.RData")
# adding in the gene symbols instead of ENSG IDs
table(rownames(rld) %in% t2g_aug2020$ens_gene_ver)
## 
##  TRUE 
## 21949
rownames(rld) <- t2g_aug2020$ext_gene[match(rownames(rld), t2g_aug2020$ens_gene_ver)]

QC

non-filtered non-normalised

plot_grid(a1,a2)

filtered and non-normalised

plot_grid(a1,a2)

filtered and normalised for lib size

plot_grid(a1,a2)

rlog norm

plot_grid(a1,a2)

PCA plot

PCA plot - protein-coding

Preparing the data frame

# the top 50 genes that drives PC1. An arbritrary threshold needs to be applied to get the top genes. 
pca_results$rotation[,"PC1"] %>% sort(decreasing=TRUE) %>%  head(n=100)
##       SMC2      POC1A  TNFAIP8L1     SAPCD2      CENPI     FANCD2      GRB14 
## 0.03750883 0.03748773 0.03743238 0.03739268 0.03728621 0.03725676 0.03718276 
##     ERCC6L       SKP2     GAS2L3      PSRC1       SMC4      CDCA8      ZWINT 
## 0.03713609 0.03713383 0.03711240 0.03709288 0.03702653 0.03701940 0.03698839 
##      BUB1B       LIG1      DTYMK       NSD2      KIF15       H2AX       BUB1 
## 0.03696752 0.03696719 0.03694847 0.03694510 0.03692765 0.03692608 0.03691568 
##      CENPE      GPSM2      TACC3     RECQL4       PLK4      SPAG5       ASPM 
## 0.03691253 0.03689612 0.03689238 0.03688875 0.03687482 0.03686878 0.03686270 
##        NMU     KIF20B      CDC20       EME1      SYCE2      KIF14     MAD2L1 
## 0.03686112 0.03683519 0.03682917 0.03682406 0.03681882 0.03681704 0.03679000 
##      PRR11       SGO1  ARHGAP11A      DSCC1     ZNF732       CBX5       DHFR 
## 0.03677434 0.03675758 0.03673881 0.03670795 0.03670726 0.03670398 0.03667945 
##      CENPW      ACBD7      FANCB      CIP2A      FANCG     SHCBP1       OIP5 
## 0.03666311 0.03666023 0.03664010 0.03662291 0.03661729 0.03660820 0.03656629 
##       TPX2      GINS4        BLM       ANLN      CCNB2      SPC24      CENPN 
## 0.03656209 0.03655261 0.03653267 0.03653258 0.03652636 0.03651757 0.03651187 
##       PIF1      GTSE1        PBK     CDC25C      KIF11     H2AC20       MXD3 
## 0.03649454 0.03648110 0.03647684 0.03647548 0.03646893 0.03646545 0.03645719 
##       DLX3     RAD54L     PIMREG     DLGAP5       YBX2      ITGAE       PLK1 
## 0.03645201 0.03644690 0.03643359 0.03642469 0.03641981 0.03640065 0.03640036 
##      RADIL      CENPF      NEIL3 AC068831.6      KIFC1     TRIP13      RAD51 
## 0.03639542 0.03636182 0.03635094 0.03633139 0.03631553 0.03631423 0.03630197 
##       KNL1      DMBX1    METTL7A     CDCA7L       DNA2      KIF4A    RACGAP1 
## 0.03629742 0.03628134 0.03626367 0.03626030 0.03624971 0.03622757 0.03622683 
##     DEPDC1     TUBA1B        KHK      KIF23     KNSTRN       DPF1      BRCA2 
## 0.03622533 0.03622332 0.03620633 0.03619628 0.03615721 0.03613347 0.03612643 
##      ATAD2      CKS1B      BIRC5     CDKN2C        TK1       SKA3      PSIP1 
## 0.03611662 0.03611167 0.03608710 0.03608033 0.03607579 0.03606540 0.03605708 
##      CDCA3      SPC25 
## 0.03605316 0.03605126
pca_results$rotation[,"PC2"] %>% sort(decreasing=TRUE) %>%  head(n=100)
##     COL4A5    PCDHGB7      FRAS1       SDC2       THEG       MAOB        PGR 
## 0.09612437 0.09580860 0.09290137 0.09243446 0.08892456 0.08840027 0.08697228 
##      SYT13        MGP       SNCG     PCDHA7       OXTR    PCDHGA2      LYPD1 
## 0.08696684 0.08552496 0.08465830 0.08399895 0.08346819 0.08334863 0.08123307 
##     COL4A6      EPHA7     PCDHB3        CPQ     PCDHB2        VIM       AREG 
## 0.08084250 0.08054588 0.07982182 0.07968364 0.07968265 0.07822765 0.07798733 
##     PCDHA6     SPOCK1    SLCO2A1      RAB7B      MOXD1      WDR90      NELL2 
## 0.07724491 0.07565791 0.07534341 0.07493638 0.07460475 0.07369086 0.07168905 
##       KLK5       STC1   ARHGAP29    ST8SIA1     S100A2      CLDN9     ZNF365 
## 0.07119041 0.07116868 0.06969321 0.06956936 0.06878089 0.06861891 0.06855880 
##    PLEKHA4      GPR37       LGI2       ENC1   SLC38A11   KRTAP3-1      KCNJ8 
## 0.06812500 0.06730398 0.06582181 0.06580688 0.06576954 0.06496009 0.06478332 
##        ISX     ADAM19      CALCR     AMIGO2      KCNK2 ST6GALNAC4       LHX1 
## 0.06464701 0.06405167 0.06392250 0.06359578 0.06343412 0.06330482 0.06268644 
##       AASS   SERPINE1       KLK6      NPY1R       STUM      GSTM1     VSTM2L 
## 0.06231461 0.06116022 0.05997918 0.05924415 0.05887327 0.05841534 0.05838649 
##    FAM131B        TNC       CCN2    TMPRSS6     GASK1B     CADPS2    PCDHB13 
## 0.05815400 0.05798170 0.05777493 0.05719736 0.05639570 0.05606980 0.05598102 
##    UBE2QL1       CD22   SERPINA1      MEF2C    MAB21L4     PGM2L1      KRT16 
## 0.05475407 0.05395560 0.05388752 0.05369156 0.05350433 0.05341584 0.05289013 
##      CDH26      SPRR3       FBN2      CLCF1     FAM83E    NECTIN3    ADAMTS9 
## 0.05277602 0.05225556 0.05170139 0.05152021 0.05151048 0.05142750 0.05135598 
##      DUSP2       BMP7      MACC1    CNTNAP2     PHLDB2       IL18       ANK2 
## 0.05113602 0.05084087 0.05060155 0.05049239 0.05043758 0.05036053 0.05016012 
##      STAT6      MAP1B    AKR1B10     AKAP12       FMN2     RNF217      TIMP2 
## 0.04992152 0.04978040 0.04970297 0.04951616 0.04906101 0.04871050 0.04865692 
##       ASB9       TP63     LRRTM3    COL17A1       CDSN   SERPINA5       DNER 
## 0.04865247 0.04861034 0.04798018 0.04790406 0.04770761 0.04759981 0.04721390 
##      KRT20        WLS 
## 0.04707634 0.04679574
## 
## Attaching package: 'PCAtools'
## The following objects are masked from 'package:stats':
## 
##     biplot, screeplot
## -- removing the lower 10% of variables based on variance
## Warning: Removed 2 row(s) containing missing values (geom_path).
## Warning: Removed 2 rows containing missing values (geom_point).

## Warning: ggrepel: 1 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

PC3 protein-coding genes: further analysis

ggplot_gene_all("KRT20")

ggplot_gene_all("KRT4")

ggplot_gene_all("EMP1")

ggplot_gene_all("UGT1A6")

ggplot_gene_all("ABCA12")

ggplot_gene_all("DIO2")

ggplot_gene_all("ASB9")

ggplot_gene_all("IL6")

ggplot_gene_all("MMP1")

ggplot_gene_all("IL32")

ggplot_gene_all("PLAUR")

PC3_100genes <- pca_results$rotation[,"PC3"] %>% sort(decreasing=TRUE) %>%  head(n=100) %>% names

heatmap_mat <- norm_mat[PC3_100genes,]
# generate z-score
scaled.mat <- t(scale(t(heatmap_mat))) 

### annotate column
columnfactor <- sample_information %>% 
  unite(col = "group" , c(knockout,clone,treatment), sep = "_") %>% .$group


annotation_column <- data.frame(row.names = colnames(scaled.mat), group = columnfactor)
# 8 groups to color
annotation_column$group %>% unique %>% length
## [1] 16
### annotate rows.  no need here
#table(rownames(scaled.mat) == names(cut_output))
#annotation_row <- data.frame(row.names = names(rownames_cluster_new), cluster = #factor(paste0("c",rownames_cluster_new), levels = c(paste0("c",1:20))))

### annotate colors
# annotate colors: column
#startrek_col <- ggsci::pal_startrek()(7)[1:4]
#names(startrek_col) <- c("DMSO","DOX","LY","RAD")

#paired_color <- brewer.pal(8,"Paired")
#names(paired_color) <- unique(annotation_column$group)
#npg_col <- pal_npg()(8)[3:4] 
#names(npg_col) <- c("sig", "nonsig")
# annotate colors: rows
# row_colors <- c(brewer.pal(12,"Paired")[9:10], brewer.pal(8,"Dark2"))
column_colors <- c(brewer.pal(8,"Dark2"), ggsci::pal_simpsons()(8))
names(column_colors) <- unique(annotation_column$group)
#row_colors <-  colorRampPalette(c("blue", "purple","green", "white", "orange","red"))(20)
#names(row_colors) <- paste0("c",1:20)
#row_colors <- pals::stepped()[1:20]
#names(row_colors) <- paste0("c",1:20)

# annotate colors: put all together
anno_colors <- list(group = column_colors)

### other parameters
scaled.mat %>% max
## [1] 3.277022
scaled.mat %>% min
## [1] -3.158797
breaksList <- seq(-3, 3, length.out = 100)

# col.pan <- colorRampPalette(c("purple","black", "yellow"))(100)
col.pan <-  colorRampPalette(c("black","dodgerblue3", "white", "orange","red"))(100)
pheatmap::pheatmap(scaled.mat, col= col.pan, cluster_rows = F, cluster_cols = F, show_rownames = T, show_colnames=T, annotation_col = annotation_column, border_color = FALSE, legend = T,fontsize_row = 4,annotation_colors = anno_colors, treeheight_col = 20, breaks = breaksList, clustering_distance_rows = "euclidean",clustering_method = "complete")

hypergeometric test for PC3 genes

library(msigdbr)
library(clusterProfiler)
h_gene_sets = msigdbr(species = "Homo sapiens", category = "H")

c2_gene_sets = msigdbr(species = "Homo sapiens", category = "C2")
c2_gene_sets <- c2_gene_sets %>% filter(grepl("REACTOME|KEGG", c2_gene_sets$gs_name))

c5_gene_sets = msigdbr(species = "Homo sapiens", category = "C5")
c5_gene_sets <- c5_gene_sets %>% filter(!grepl("HP_", c5_gene_sets$gs_name))

MSigDB.ofinterest <- rbind(h_gene_sets, c2_gene_sets, c5_gene_sets)
MSigDB.ofinterest <- MSigDB.ofinterest  %>% dplyr::select(gs_name, gene_symbol)

PC3_hypergeo_output <-   enricher(PC3_100genes, TERM2GENE = MSigDB.ofinterest)
PC3_hypergeo_output %>% as_tibble %>% DT::datatable()

PCA plot - lncRNA

pca_results_lncRNA <- prcomp(t(rld_lncRNA[select,]), center =TRUE, scale = TRUE) 
# the top 50 genes that drives PC1. An arbritrary threshold needs to be applied to get the top genes. 
pca_results_lncRNA$rotation[,"PC1"] %>% sort(decreasing=TRUE) %>%  head(n=100)
##       SNHG19     NAPA-AS1        SNHG3   AL441992.1    LINC00997      PXN-AS1 
##   0.05510266   0.05485047   0.05484211   0.05462298   0.05425790   0.05409434 
##   PITPNA-AS1   AL513165.1   AC103746.1    LINC01963   AC137932.1        KIFC1 
##   0.05400740   0.05382320   0.05376251   0.05350812   0.05341298   0.05317138 
##   AC092718.4   AC010642.2   AP002360.1   AC009005.1        MINCR   AC091178.2 
##   0.05314825   0.05302811   0.05288654   0.05283861   0.05280396   0.05278254 
##    LINC02762        DANCR     MIR924HG   AL359399.1      PCCA-DT   AC012470.1 
##   0.05268556   0.05243747   0.05231806   0.05226991   0.05192394   0.05158491 
##   AL138966.2   ATP2A1-AS1    LINC01315   AP001505.1   AL356481.3   AC018645.3 
##   0.05155656   0.05150577   0.05149731   0.05147394   0.05143898   0.05143008 
##   AP002381.2   AC011603.3   AC125807.2        EMSLR    LINC02506   AC004540.2 
##   0.05141059   0.05124142   0.05118889   0.05108663   0.05089685   0.05088656 
##   AL133215.2   AC091057.1   AC069148.1   AL132857.1   AC020765.2   AC004540.1 
##   0.05086646   0.05064589   0.05044579   0.05032377   0.05005581   0.04984836 
##   AC002310.2 ADAMTS19-AS1   AC006026.3   AC008555.4   AL158212.3   AL136419.3 
##   0.04969167   0.04960804   0.04949045   0.04947325   0.04941449   0.04928650 
##    PRKG1-AS1     MIR600HG     SDK1-AS1   AC026401.3   AL589843.1   KCTD21-AS1 
##   0.04906646   0.04894267   0.04889751   0.04851915   0.04832020   0.04822915 
##        SNHG6   AC007106.1    LINC01135   AC012146.1        GLIDR   AC005746.2 
##   0.04818828   0.04817471   0.04811825   0.04805854   0.04793408   0.04786549 
##   SUCLG2-AS1   AC005746.3        FIRRE   ELOVL2-AS1       SNHG10   AC091563.1 
##   0.04771022   0.04764743   0.04760337   0.04714158   0.04680526   0.04670420 
##    DGUOK-AS1   ZNF710-AS1   ARRDC1-AS1   AC012615.1   AC136297.1   AC105046.1 
##   0.04644262   0.04644191   0.04574321   0.04570545   0.04557971   0.04443829 
##     TMPO-AS1   AC127070.1   AL031663.3     HAS2-AS1    CCDC84-DT    LINC01016 
##   0.04437070   0.04420862   0.04401237   0.04223360   0.04161276   0.04156050 
##   AC061992.2     U47924.2       SNHG21   AC108479.1    KDM4A-AS1   AL390294.1 
##   0.04145478   0.04100776   0.04084811   0.04076751   0.03826313   0.03774296 
##     MIR9-3HG   AC074135.1    MELTF-AS1    DDX11-AS1       TYMSOS   AC083967.1 
##   0.03735406   0.03556548   0.03506964   0.03410505   0.03407073   0.03291299 
##   AC138904.1   AL606500.1   AC132192.2   AL118516.1   AC010641.2   AC021134.1 
##   0.03244705   0.03236186   0.03179028   0.03093279   0.03089415   0.02943644 
##   AC105383.1    LINC00683      LHX1-DT   AC104663.1 
##   0.02860864   0.02758960   0.02345337   0.01112784
pca_results_lncRNA$rotation[,"PC2"] %>% sort(decreasing=TRUE) %>%  head(n=100)
##   AC025048.4   AC015912.3    RN7SL832P   AC008610.1   AC005520.5    FOXD2-AS1 
##   0.12494027   0.11973172   0.11972264   0.11707063   0.11445560   0.11064686 
##   AL122058.1   AL118516.1     EML2-AS1   AL807742.1   AC012640.1   AC011498.1 
##   0.10993891   0.10911781   0.10767850   0.10304868   0.10261978   0.10195122 
##   AC005363.2   AL357556.4   AC100791.1    LINC00052   AL021155.4   AC132192.2 
##   0.10125004   0.10033364   0.09746521   0.09737603   0.09736656   0.09717165 
##   AL121944.2   AC123768.1   AC022506.2        FALEC   AC100861.2   AC006460.1 
##   0.09687361   0.09672976   0.09595125   0.09513574   0.09285605   0.09147663 
##   AC010624.5   AC007365.1   AL606500.1     FLJ31104    LINC02235   AC111149.2 
##   0.08937272   0.08882696   0.08798254   0.08630194   0.08209807   0.08026034 
##   AC005005.4   AP001160.1   AC010978.1   AC025279.1   AL121936.1   AL109840.2 
##   0.08025740   0.08022511   0.07941137   0.07931164   0.07815328   0.07735621 
##    DDX11-AS1       SNHG21    LINC02605   AC007389.2    CCDC84-DT   AC116025.2 
##   0.07728164   0.07685225   0.07684033   0.07566794   0.07498532   0.07495090 
##    LINC02233     TMPO-AS1    KDM4A-AS1    MELTF-AS1   AC010327.7    LINC01345 
##   0.07478607   0.07400290   0.07359528   0.07350641   0.07329432   0.07327695 
##   AC044849.1     MIR9-3HG    LINC01517   AC108479.1     U47924.2       SNHG10 
##   0.07301511   0.07276381   0.07185499   0.07050353   0.07036831   0.06938289 
##   AL133370.1   AP003119.3   AC009093.4    LINC01476       TYMSOS      LHX1-DT 
##   0.06796496   0.06735050   0.06709791   0.06650317   0.06554956   0.06481574 
##   AC108134.2       LUCAT1    LINC02154    LINC02672         EGOT      LNCAROD 
##   0.06465599   0.06448604   0.06368642   0.06340953   0.06302292   0.06260716 
##   AC006076.1 CTC-338M12.4  NECTIN3-AS1   ARRDC1-AS1   DLGAP1-AS2   AC012615.1 
##   0.06240301   0.06096335   0.05971948   0.05950587   0.05891872   0.05876340 
##   AC136297.1        MANCR   AC107959.3   AC245041.2    LINC01484   AC026401.3 
##   0.05869429   0.05811877   0.05803273   0.05767723   0.05755152   0.05716911 
##   AC021218.1   AL357033.4   AL365181.3     CD27-AS1    LINC00944  LURAP1L-AS1 
##   0.05691210   0.05675830   0.05671149   0.05663044   0.05656194   0.05519646 
##   AP003119.2   AC092171.3    MIR4458HG   AC083967.1   AC083837.1   AL365181.2 
##   0.05515048   0.05492417   0.05489388   0.05428222   0.05418090   0.05392374 
##    LINC01629       FAM30A   AC027288.3   ZNF295-AS1   AL590666.2    LINC01285 
##   0.05382801   0.05376927   0.05357262   0.05311695   0.05291566   0.05266594 
##   AC092171.5    LINC02246    STAU2-AS1   AC026785.3 
##   0.05228642   0.05183273   0.05165222   0.05160941
a1 <- dplyr::filter(df.PCA_lncRNA_500) %>% 
ggplot(aes(x = PC1, y = PC2, col = treatment, shape = knockout)) +
  geom_point(size = 7) + 
  geom_text_repel(aes(label = samples), size = 3) +
#  scale_x_continuous(limits = c(-150, 150)) +
  scale_color_manual(values = col_npg5) +
  labs(shape = "sample", col = "treatment", 
       x = paste0("PC1","(",round(pca_lncRNA_results$sdev[1]^2/sum(pca_lncRNA_results$sdev^2)*100,1), "%)"), 
       y = paste0("PC2","(",round(pca_lncRNA_results$sdev[2]^2/sum(pca_lncRNA_results$sdev^2)*100,1), "%)")) +
  ggtitle("lncRNA (top 500 highest variance)") +
#  facet_grid( . ~ transformation) +
  theme_bw() +
  theme(axis.text = element_text(color = "black", size = 15), axis.title = element_text(color = "black", size = 20), plot.title = element_text(hjust=0.5))

a2 <- dplyr::filter(df.PCA_lncRNA_500) %>% 
ggplot(aes(x = PC1, y = PC3, col = treatment, shape = knockout)) +
  geom_point(size = 7) + 
  geom_text_repel(aes(label = samples), size = 3) +
#  scale_x_continuous(limits = c(-150, 150)) +
  scale_color_manual(values = col_npg5) +
  labs(shape = "sample", col = "treatment", x = paste0("PC1","(",round(pca_lncRNA_results$sdev[1]^2/sum(pca_lncRNA_results$sdev^2)*100,1), "%)"), 
       y = paste0("PC3","(",round(pca_lncRNA_results$sdev[3]^2/sum(pca_lncRNA_results$sdev^2)*100,1), "%)")) +
  ggtitle("lncRNA (top 500 highest variance)") +
#  facet_grid( . ~ transformation) +
  theme_bw() +
  theme(axis.text = element_text(color = "black", size = 15), axis.title = element_text(color = "black", size = 20), plot.title = element_text(hjust=0.5))

a3 <- dplyr::filter(df.PCA_lncRNA_500) %>% 
ggplot(aes(x = PC2, y = PC3, col = treatment, shape = knockout)) +
  geom_point(size = 7) + 
  geom_text_repel(aes(label = samples), size = 3) +
#  scale_x_continuous(limits = c(-150, 150)) +
  scale_color_manual(values = col_npg5) +
  labs(shape = "sample", col = "treatment", 
       x = paste0("PC2","(",round(pca_lncRNA_results$sdev[2]^2/sum(pca_lncRNA_results$sdev^2)*100,1), "%)"), 
       y = paste0("PC3","(",round(pca_lncRNA_results$sdev[3]^2/sum(pca_lncRNA_results$sdev^2)*100,1), "%)")) +
  ggtitle("lncRNA (top 500 highest variance)") +
#  facet_grid( . ~ transformation) +
  theme_bw() +
  theme(axis.text = element_text(color = "black", size = 15), axis.title = element_text(color = "black", size = 20), plot.title = element_text(hjust=0.5))

hierarchical cluster

a1

a2

save count matrix

save RData

save.image("/researchers/krutika.ambani/Goel_lab_members/Keefe_Chan/220714_sgRB1_sgTP53_sgControl_DMSO_DOX_LY/R_analysis/220714_intiial_QC_AA/1.QC_exploratory/data/QC_exploratory.RData")

load("/researchers/krutika.ambani/Goel_lab_members/Keefe_Chan/220714_sgRB1_sgTP53_sgControl_DMSO_DOX_LY/R_analysis/220714_intiial_QC_AA/1.QC_exploratory/data/QC_exploratory.RData")